# Data manipulation and plotting modules
import numpy as np
import pandas as pd
from collections import Counter
# Data pre-processing
# z = (x-mean)/stdev
from sklearn.preprocessing import StandardScaler as ss
# Dimensionality reduction
from sklearn.decomposition import PCA
# Data splitting and model parameter search
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
# Modeling modules
from xgboost.sklearn import XGBClassifier
# Model pipelining
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
# Model evaluation metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import confusion_matrix
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.offline as pyo
import plotly.express as px
import plotly.graph_objs as go
pyo.init_notebook_mode()
import plotly.figure_factory as ff
from xgboost import plot_importance
# Needed for Bayes optimization
# Takes an estimator, performs cross-validation and gives out average scorefrom sklearn.model_selection import cross_val_score
# Misc
import time
import os
import gc
import random
# Used in Randomized parameter search
from scipy.stats import uniform
data = pd.read_csv("C:\\Users\\Lenovo\\Desktop\\Loan Details.csv")
data.shape
(67463, 35)
data.head()
| ID | Loan Amount | Funded Amount | Funded Amount Investor | Term | Batch Enrolled | Interest Rate | Grade | Sub Grade | Employment Duration | ... | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Application Type | Last week Pay | Accounts Delinquent | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 65087372 | 10000 | 32236 | 12329.36286 | 59 | BAT2522922 | 11.135007 | B | C4 | MORTGAGE | ... | 2.498291 | 0.793724 | 0 | INDIVIDUAL | 49 | 0 | 31 | 311301 | 6619 | 0 |
| 1 | 1450153 | 3609 | 11940 | 12191.99692 | 59 | BAT1586599 | 12.237563 | C | D3 | RENT | ... | 2.377215 | 0.974821 | 0 | INDIVIDUAL | 109 | 0 | 53 | 182610 | 20885 | 0 |
| 2 | 1969101 | 28276 | 9311 | 21603.22455 | 59 | BAT2136391 | 12.545884 | F | D4 | MORTGAGE | ... | 4.316277 | 1.020075 | 0 | INDIVIDUAL | 66 | 0 | 34 | 89801 | 26155 | 0 |
| 3 | 6651430 | 11170 | 6954 | 17877.15585 | 59 | BAT2428731 | 16.731201 | C | C3 | MORTGAGE | ... | 0.107020 | 0.749971 | 0 | INDIVIDUAL | 39 | 0 | 40 | 9189 | 60214 | 0 |
| 4 | 14354669 | 16890 | 13226 | 13539.92667 | 59 | BAT5341619 | 15.008300 | C | D4 | MORTGAGE | ... | 1294.818751 | 0.368953 | 0 | INDIVIDUAL | 18 | 0 | 430 | 126029 | 22579 | 0 |
5 rows × 35 columns
data.nunique()
ID 67463 Loan Amount 27525 Funded Amount 24548 Funded Amount Investor 67441 Term 3 Batch Enrolled 41 Interest Rate 67448 Grade 7 Sub Grade 35 Employment Duration 3 Home Ownership 67454 Verification Status 3 Payment Plan 1 Loan Title 109 Debit to Income 67454 Delinquency - two years 9 Inquires - six months 6 Open Account 36 Public Record 5 Revolving Balance 20582 Revolving Utilities 67458 Total Accounts 69 Initial List Status 2 Total Received Interest 67451 Total Received Late Fee 67380 Recoveries 67387 Collection Recovery Fee 67313 Collection 12 months Medical 2 Application Type 2 Last week Pay 162 Accounts Delinquent 1 Total Collection Amount 2193 Total Current Balance 60901 Total Revolving Credit Limit 37708 Loan Status 2 dtype: int64
data.isnull().sum()
ID 0 Loan Amount 0 Funded Amount 0 Funded Amount Investor 0 Term 0 Batch Enrolled 0 Interest Rate 0 Grade 0 Sub Grade 0 Employment Duration 0 Home Ownership 0 Verification Status 0 Payment Plan 0 Loan Title 0 Debit to Income 0 Delinquency - two years 0 Inquires - six months 0 Open Account 0 Public Record 0 Revolving Balance 0 Revolving Utilities 0 Total Accounts 0 Initial List Status 0 Total Received Interest 0 Total Received Late Fee 0 Recoveries 0 Collection Recovery Fee 0 Collection 12 months Medical 0 Application Type 0 Last week Pay 0 Accounts Delinquent 0 Total Collection Amount 0 Total Current Balance 0 Total Revolving Credit Limit 0 Loan Status 0 dtype: int64
data.describe()
| ID | Loan Amount | Funded Amount | Funded Amount Investor | Term | Interest Rate | Home Ownership | Debit to Income | Delinquency - two years | Inquires - six months | ... | Total Received Late Fee | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Last week Pay | Accounts Delinquent | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 6.746300e+04 | 67463.000000 | 67463.000000 | 67463.000000 | 67463.000000 | 67463.000000 | 67463.000000 | 67463.000000 | 67463.000000 | 67463.000000 | ... | 67463.000000 | 67463.000000 | 67463.000000 | 67463.000000 | 67463.000000 | 67463.0 | 67463.000000 | 6.746300e+04 | 67463.000000 | 67463.000000 |
| mean | 2.562761e+07 | 16848.902776 | 15770.599114 | 14621.799323 | 58.173814 | 11.846258 | 80541.502522 | 23.299241 | 0.327127 | 0.145754 | ... | 1.143969 | 59.691578 | 1.125141 | 0.021301 | 71.163260 | 0.0 | 146.467990 | 1.595739e+05 | 23123.005544 | 0.092510 |
| std | 2.109155e+07 | 8367.865726 | 8150.992662 | 6785.345170 | 3.327441 | 3.718629 | 45029.120366 | 8.451824 | 0.800888 | 0.473291 | ... | 5.244365 | 357.026346 | 3.489885 | 0.144385 | 43.315845 | 0.0 | 744.382233 | 1.390332e+05 | 20916.699999 | 0.289747 |
| min | 1.297933e+06 | 1014.000000 | 1014.000000 | 1114.590204 | 36.000000 | 5.320006 | 14573.537170 | 0.675299 | 0.000000 | 0.000000 | ... | 0.000003 | 0.000036 | 0.000036 | 0.000000 | 0.000000 | 0.0 | 1.000000 | 6.170000e+02 | 1000.000000 | 0.000000 |
| 25% | 6.570288e+06 | 10012.000000 | 9266.500000 | 9831.684984 | 58.000000 | 9.297147 | 51689.843335 | 16.756416 | 0.000000 | 0.000000 | ... | 0.021114 | 1.629818 | 0.476259 | 0.000000 | 35.000000 | 0.0 | 24.000000 | 5.037900e+04 | 8155.500000 | 0.000000 |
| 50% | 1.791565e+07 | 16073.000000 | 13042.000000 | 12793.682170 | 59.000000 | 11.377696 | 69335.832680 | 22.656658 | 0.000000 | 0.000000 | ... | 0.043398 | 3.344524 | 0.780141 | 0.000000 | 68.000000 | 0.0 | 36.000000 | 1.183690e+05 | 16733.000000 | 0.000000 |
| 75% | 4.271521e+07 | 22106.000000 | 21793.000000 | 17807.594120 | 59.000000 | 14.193533 | 94623.322785 | 30.048400 | 0.000000 | 0.000000 | ... | 0.071884 | 5.453727 | 1.070566 | 0.000000 | 105.000000 | 0.0 | 46.000000 | 2.283750e+05 | 32146.500000 | 0.000000 |
| max | 7.224578e+07 | 35000.000000 | 34999.000000 | 34999.746430 | 59.000000 | 27.182348 | 406561.536400 | 39.629862 | 8.000000 | 5.000000 | ... | 42.618882 | 4354.467419 | 166.833000 | 1.000000 | 161.000000 | 0.0 | 16421.000000 | 1.177412e+06 | 201169.000000 | 1.000000 |
8 rows × 26 columns
data.dtypes
ID int64 Loan Amount int64 Funded Amount int64 Funded Amount Investor float64 Term int64 Batch Enrolled object Interest Rate float64 Grade object Sub Grade object Employment Duration object Home Ownership float64 Verification Status object Payment Plan object Loan Title object Debit to Income float64 Delinquency - two years int64 Inquires - six months int64 Open Account int64 Public Record int64 Revolving Balance int64 Revolving Utilities float64 Total Accounts int64 Initial List Status object Total Received Interest float64 Total Received Late Fee float64 Recoveries float64 Collection Recovery Fee float64 Collection 12 months Medical int64 Application Type object Last week Pay int64 Accounts Delinquent int64 Total Collection Amount int64 Total Current Balance int64 Total Revolving Credit Limit int64 Loan Status int64 dtype: object
import seaborn as sns
ax=sns.countplot(x='Loan Status',data=data);
# Finding Correlation,
corrMatrix=data.corr()
corrMatrix
| ID | Loan Amount | Funded Amount | Funded Amount Investor | Term | Interest Rate | Home Ownership | Debit to Income | Delinquency - two years | Inquires - six months | ... | Total Received Late Fee | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Last week Pay | Accounts Delinquent | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | 1.000000 | -0.003480 | -0.003302 | 0.002954 | 0.003226 | 0.004258 | -0.004390 | -0.010578 | 0.000568 | -0.006628 | ... | 0.006674 | -0.001435 | -0.001802 | -0.002900 | 0.000907 | NaN | 0.003745 | -0.003572 | -0.005141 | 0.000472 |
| Loan Amount | -0.003480 | 1.000000 | -0.000551 | 0.002831 | 0.004277 | -0.004888 | 0.016691 | 0.007959 | -0.000469 | 0.008962 | ... | -0.000034 | -0.001606 | -0.002142 | -0.002726 | -0.002362 | NaN | -0.004135 | -0.008285 | 0.002289 | -0.004473 |
| Funded Amount | -0.003302 | -0.000551 | 1.000000 | 0.010227 | -0.001503 | 0.002310 | -0.003518 | 0.002347 | 0.011313 | -0.001587 | ... | 0.001542 | 0.000462 | 0.000175 | 0.001071 | -0.003476 | NaN | -0.002821 | -0.001499 | 0.006145 | 0.001364 |
| Funded Amount Investor | 0.002954 | 0.002831 | 0.010227 | 1.000000 | -0.008943 | -0.001917 | 0.001339 | 0.000112 | 0.001925 | -0.003073 | ... | -0.000232 | 0.000966 | -0.007272 | 0.001814 | 0.004248 | NaN | 0.006862 | 0.003283 | 0.005669 | -0.000091 |
| Term | 0.003226 | 0.004277 | -0.001503 | -0.008943 | 1.000000 | -0.012688 | -0.021813 | 0.001026 | -0.004494 | -0.005272 | ... | -0.000736 | -0.003475 | -0.001203 | -0.003263 | 0.007035 | NaN | 0.000358 | 0.003361 | -0.005068 | -0.003410 |
| Interest Rate | 0.004258 | -0.004888 | 0.002310 | -0.001917 | -0.012688 | 1.000000 | 0.005467 | -0.011203 | 0.004045 | 0.009172 | ... | 0.003119 | 0.009348 | 0.001281 | -0.009895 | -0.012652 | NaN | 0.002771 | -0.002567 | 0.016651 | 0.002900 |
| Home Ownership | -0.004390 | 0.016691 | -0.003518 | 0.001339 | -0.021813 | 0.005467 | 1.000000 | 0.022781 | -0.003793 | 0.005024 | ... | 0.004011 | 0.004399 | -0.003821 | -0.002639 | -0.004573 | NaN | 0.006314 | 0.007117 | 0.005008 | 0.003716 |
| Debit to Income | -0.010578 | 0.007959 | 0.002347 | 0.000112 | 0.001026 | -0.011203 | 0.022781 | 1.000000 | -0.004007 | 0.002246 | ... | -0.010224 | -0.009693 | 0.002200 | -0.000716 | 0.008639 | NaN | 0.001555 | -0.011582 | -0.007236 | -0.003057 |
| Delinquency - two years | 0.000568 | -0.000469 | 0.011313 | 0.001925 | -0.004494 | 0.004045 | -0.003793 | -0.004007 | 1.000000 | 0.014679 | ... | 0.007943 | 0.017348 | 0.002707 | 0.003451 | 0.001160 | NaN | 0.005642 | 0.002602 | 0.009315 | 0.009990 |
| Inquires - six months | -0.006628 | 0.008962 | -0.001587 | -0.003073 | -0.005272 | 0.009172 | 0.005024 | 0.002246 | 0.014679 | 1.000000 | ... | 0.008296 | 0.012487 | 0.008388 | -0.004436 | -0.004453 | NaN | -0.002210 | 0.001531 | 0.004678 | 0.000578 |
| Open Account | -0.002781 | 0.009088 | 0.005755 | -0.007850 | 0.021362 | -0.003250 | 0.009080 | 0.001100 | 0.004904 | -0.002109 | ... | 0.000828 | -0.001216 | -0.006832 | 0.003397 | -0.013415 | NaN | -0.006176 | -0.007470 | 0.003096 | -0.007073 |
| Public Record | 0.009830 | -0.002542 | 0.003750 | 0.005002 | -0.002827 | 0.006979 | 0.005631 | -0.007813 | 0.006716 | 0.004087 | ... | 0.016594 | 0.008905 | 0.004725 | 0.008878 | 0.004281 | NaN | 0.012928 | 0.003935 | 0.012046 | 0.010590 |
| Revolving Balance | 0.004315 | -0.001738 | -0.004485 | -0.009102 | -0.002317 | 0.018999 | 0.016783 | -0.011414 | 0.009394 | 0.002457 | ... | 0.004903 | 0.005056 | -0.003939 | 0.007516 | -0.010279 | NaN | 0.004282 | -0.007537 | 0.023366 | -0.001073 |
| Revolving Utilities | 0.000910 | 0.014828 | 0.004460 | -0.003027 | -0.010018 | 0.006089 | -0.005556 | 0.003691 | 0.002474 | 0.005150 | ... | -0.001363 | -0.002381 | -0.001952 | 0.010783 | 0.007961 | NaN | 0.006067 | -0.019785 | -0.009818 | 0.004120 |
| Total Accounts | 0.001518 | -0.002071 | 0.008298 | 0.003191 | 0.001204 | 0.006584 | 0.021452 | -0.005683 | -0.003085 | 0.007562 | ... | 0.004910 | -0.002171 | 0.007701 | 0.002538 | 0.015405 | NaN | 0.005593 | -0.006937 | 0.032492 | 0.000222 |
| Total Received Interest | -0.002938 | -0.001887 | 0.002759 | 0.001432 | 0.008663 | 0.006998 | -0.010346 | 0.006504 | -0.004511 | 0.009556 | ... | 0.002507 | -0.000717 | 0.003921 | 0.004759 | 0.002643 | NaN | 0.001027 | 0.001374 | 0.012015 | 0.001680 |
| Total Received Late Fee | 0.006674 | -0.000034 | 0.001542 | -0.000232 | -0.000736 | 0.003119 | 0.004011 | -0.010224 | 0.007943 | 0.008296 | ... | 1.000000 | 0.007992 | 0.004856 | 0.003590 | 0.001776 | NaN | 0.007441 | -0.000526 | 0.014839 | 0.009365 |
| Recoveries | -0.001435 | -0.001606 | 0.000462 | 0.000966 | -0.003475 | 0.009348 | 0.004399 | -0.009693 | 0.017348 | 0.012487 | ... | 0.007992 | 1.000000 | 0.008328 | 0.007874 | -0.001787 | NaN | 0.004703 | -0.000488 | -0.000146 | -0.000652 |
| Collection Recovery Fee | -0.001802 | -0.002142 | 0.000175 | -0.007272 | -0.001203 | 0.001281 | -0.003821 | 0.002200 | 0.002707 | 0.008388 | ... | 0.004856 | 0.008328 | 1.000000 | 0.007689 | 0.001106 | NaN | 0.001800 | 0.002851 | -0.000208 | -0.003828 |
| Collection 12 months Medical | -0.002900 | -0.002726 | 0.001071 | 0.001814 | -0.003263 | -0.009895 | -0.002639 | -0.000716 | 0.003451 | -0.004436 | ... | 0.003590 | 0.007874 | 0.007689 | 1.000000 | 0.005540 | NaN | 0.004768 | 0.001728 | -0.001033 | -0.000686 |
| Last week Pay | 0.000907 | -0.002362 | -0.003476 | 0.004248 | 0.007035 | -0.012652 | -0.004573 | 0.008639 | 0.001160 | -0.004453 | ... | 0.001776 | -0.001787 | 0.001106 | 0.005540 | 1.000000 | NaN | 0.001457 | 0.001147 | -0.001583 | 0.006117 |
| Accounts Delinquent | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Total Collection Amount | 0.003745 | -0.004135 | -0.002821 | 0.006862 | 0.000358 | 0.002771 | 0.006314 | 0.001555 | 0.005642 | -0.002210 | ... | 0.007441 | 0.004703 | 0.001800 | 0.004768 | 0.001457 | NaN | 1.000000 | 0.005318 | 0.006541 | 0.007892 |
| Total Current Balance | -0.003572 | -0.008285 | -0.001499 | 0.003283 | 0.003361 | -0.002567 | 0.007117 | -0.011582 | 0.002602 | 0.001531 | ... | -0.000526 | -0.000488 | 0.002851 | 0.001728 | 0.001147 | NaN | 0.005318 | 1.000000 | 0.004865 | 0.009828 |
| Total Revolving Credit Limit | -0.005141 | 0.002289 | 0.006145 | 0.005669 | -0.005068 | 0.016651 | 0.005008 | -0.007236 | 0.009315 | 0.004678 | ... | 0.014839 | -0.000146 | -0.000208 | -0.001033 | -0.001583 | NaN | 0.006541 | 0.004865 | 1.000000 | 0.001454 |
| Loan Status | 0.000472 | -0.004473 | 0.001364 | -0.000091 | -0.003410 | 0.002900 | 0.003716 | -0.003057 | 0.009990 | 0.000578 | ... | 0.009365 | -0.000652 | -0.003828 | -0.000686 | 0.006117 | NaN | 0.007892 | 0.009828 | 0.001454 | 1.000000 |
26 rows × 26 columns
ax= plt.axes()
_=sns.heatmap(data.drop(columns=['Total Current Balance']).corr(),cmap='Blues',cbar=None,ax=ax)
_=ax.set_title('Correlation Heatmap')
plt.figure(figsize=(5,5))
a = sns.barplot(x="Grade",y="Total Collection Amount",data=data)
data["Loan_qty"] = pd.qcut(
data['Loan Amount'],
q = 3,
labels= ["low", "medium", "high"]
)
data.Loan_qty.value_counts()
low 22489 medium 22488 high 22486 Name: Loan_qty, dtype: int64
data.drop(labels='ID', axis=1, inplace=True)
data.drop(labels='Batch Enrolled', axis=1, inplace=True)
data.drop(['Debit to Income','Application Type'], axis=1, inplace=True)
data.head()
| Loan Amount | Funded Amount | Funded Amount Investor | Term | Interest Rate | Grade | Sub Grade | Employment Duration | Home Ownership | Verification Status | ... | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Last week Pay | Accounts Delinquent | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | Loan_qty | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10000 | 32236 | 12329.36286 | 59 | 11.135007 | B | C4 | MORTGAGE | 176346.62670 | Not Verified | ... | 2.498291 | 0.793724 | 0 | 49 | 0 | 31 | 311301 | 6619 | 0 | low |
| 1 | 3609 | 11940 | 12191.99692 | 59 | 12.237563 | C | D3 | RENT | 39833.92100 | Source Verified | ... | 2.377215 | 0.974821 | 0 | 109 | 0 | 53 | 182610 | 20885 | 0 | low |
| 2 | 28276 | 9311 | 21603.22455 | 59 | 12.545884 | F | D4 | MORTGAGE | 91506.69105 | Source Verified | ... | 4.316277 | 1.020075 | 0 | 66 | 0 | 34 | 89801 | 26155 | 0 | high |
| 3 | 11170 | 6954 | 17877.15585 | 59 | 16.731201 | C | C3 | MORTGAGE | 108286.57590 | Source Verified | ... | 0.107020 | 0.749971 | 0 | 39 | 0 | 40 | 9189 | 60214 | 0 | low |
| 4 | 16890 | 13226 | 13539.92667 | 59 | 15.008300 | C | D4 | MORTGAGE | 44234.82545 | Source Verified | ... | 1294.818751 | 0.368953 | 0 | 18 | 0 | 430 | 126029 | 22579 | 0 | medium |
5 rows × 32 columns
# Data pre-processing
# Data splitting and model parameter search
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
# Hyperparameter optimization
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import numpy as np
import pandas as pd
from time import time
import pprint
import joblib
import warnings
warnings.filterwarnings("ignore")
# Classifiers
from catboost import CatBoostClassifier
# Model selection
from sklearn.model_selection import StratifiedKFold
# Metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
# Skopt functions
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, VerboseCallback, DeltaXStopper
from skopt.space import Real, Categorical, Integer
from time import time
# z = (x-mean)/stdev
from sklearn.preprocessing import StandardScaler as ss
# Dimensionality reduction and noise removal
from sklearn.decomposition import PCA
from xgboost.sklearn import XGBClassifier
# Model pipelining
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
# Hyperparameter optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
# For plotting
import matplotlib.pyplot as plt
# Model evaluation metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import confusion_matrix
# From a cell display outputs from multiple commands:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# To check train data file total columns and rows
print("\n Shape")
data.shape # 67463, 33)
#To check total columns
print("\n\n Columns")
data.columns.values
print("\n")
#To check Data types
print("\n\nData types")
data.dtypes.value_counts()
print("\n")
data.head(3)
Shape
(67463, 32)
Columns
array(['Loan Amount', 'Funded Amount', 'Funded Amount Investor', 'Term',
'Interest Rate', 'Grade', 'Sub Grade', 'Employment Duration',
'Home Ownership', 'Verification Status', 'Payment Plan',
'Loan Title', 'Delinquency - two years', 'Inquires - six months',
'Open Account', 'Public Record', 'Revolving Balance',
'Revolving Utilities', 'Total Accounts', 'Initial List Status',
'Total Received Interest', 'Total Received Late Fee', 'Recoveries',
'Collection Recovery Fee', 'Collection 12 months Medical',
'Last week Pay', 'Accounts Delinquent', 'Total Collection Amount',
'Total Current Balance', 'Total Revolving Credit Limit',
'Loan Status', 'Loan_qty'], dtype=object)
Data types
int64 16 float64 8 object 7 category 1 dtype: int64
| Loan Amount | Funded Amount | Funded Amount Investor | Term | Interest Rate | Grade | Sub Grade | Employment Duration | Home Ownership | Verification Status | ... | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Last week Pay | Accounts Delinquent | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | Loan_qty | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10000 | 32236 | 12329.36286 | 59 | 11.135007 | B | C4 | MORTGAGE | 176346.62670 | Not Verified | ... | 2.498291 | 0.793724 | 0 | 49 | 0 | 31 | 311301 | 6619 | 0 | low |
| 1 | 3609 | 11940 | 12191.99692 | 59 | 12.237563 | C | D3 | RENT | 39833.92100 | Source Verified | ... | 2.377215 | 0.974821 | 0 | 109 | 0 | 53 | 182610 | 20885 | 0 | low |
| 2 | 28276 | 9311 | 21603.22455 | 59 | 12.545884 | F | D4 | MORTGAGE | 91506.69105 | Source Verified | ... | 4.316277 | 1.020075 | 0 | 66 | 0 | 34 | 89801 | 26155 | 0 | high |
3 rows × 32 columns
cat = ['Grade','Sub Grade','Employment Duration','Verification Status','Payment Plan','Loan Title','Initial List Status','Loan_qty']
# Import label encoder
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
data[cat]=data[cat].apply(label_encoder.fit_transform)
data.head()
| Loan Amount | Funded Amount | Funded Amount Investor | Term | Interest Rate | Grade | Sub Grade | Employment Duration | Home Ownership | Verification Status | ... | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Last week Pay | Accounts Delinquent | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | Loan_qty | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10000 | 32236 | 12329.36286 | 59 | 11.135007 | 1 | 13 | 0 | 176346.62670 | 0 | ... | 2.498291 | 0.793724 | 0 | 49 | 0 | 31 | 311301 | 6619 | 0 | 1 |
| 1 | 3609 | 11940 | 12191.99692 | 59 | 12.237563 | 2 | 17 | 2 | 39833.92100 | 1 | ... | 2.377215 | 0.974821 | 0 | 109 | 0 | 53 | 182610 | 20885 | 0 | 1 |
| 2 | 28276 | 9311 | 21603.22455 | 59 | 12.545884 | 5 | 18 | 0 | 91506.69105 | 1 | ... | 4.316277 | 1.020075 | 0 | 66 | 0 | 34 | 89801 | 26155 | 0 | 0 |
| 3 | 11170 | 6954 | 17877.15585 | 59 | 16.731201 | 2 | 12 | 0 | 108286.57590 | 1 | ... | 0.107020 | 0.749971 | 0 | 39 | 0 | 40 | 9189 | 60214 | 0 | 1 |
| 4 | 16890 | 13226 | 13539.92667 | 59 | 15.008300 | 2 | 18 | 0 | 44234.82545 | 1 | ... | 1294.818751 | 0.368953 | 0 | 18 | 0 | 430 | 126029 | 22579 | 0 | 2 |
5 rows × 32 columns
# Divide data into predictors and target
# First 10 columns are predictors
X = data.iloc[ :, 0:30]
X.head(2)
# 11st column is target
print("\n\nTarget,y, values")
y = data.iloc[ : , 30]
y.head()
| Loan Amount | Funded Amount | Funded Amount Investor | Term | Interest Rate | Grade | Sub Grade | Employment Duration | Home Ownership | Verification Status | ... | Total Received Interest | Total Received Late Fee | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Last week Pay | Accounts Delinquent | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10000 | 32236 | 12329.36286 | 59 | 11.135007 | 1 | 13 | 0 | 176346.6267 | 0 | ... | 2929.646315 | 0.102055 | 2.498291 | 0.793724 | 0 | 49 | 0 | 31 | 311301 | 6619 |
| 1 | 3609 | 11940 | 12191.99692 | 59 | 12.237563 | 2 | 17 | 2 | 39833.9210 | 1 | ... | 772.769385 | 0.036181 | 2.377215 | 0.974821 | 0 | 109 | 0 | 53 | 182610 | 20885 |
2 rows × 30 columns
Target,y, values
0 0 1 0 2 0 3 0 4 0 Name: Loan Status, dtype: int64
# Split dataset into train and validation parts
X_train, X_test, y_train, y_test = train_test_split(X,
y,
test_size=0.20,
shuffle = True,
stratify = y
)
X_train.shape
X_test.shape
y_train.shape
y_test.shape
(53970, 30)
(13493, 30)
(53970,)
(13493,)
#Creating pipeline
#Pipe using XGBoost
steps_xg = [('sts', ss() ),
('pca', PCA()),
('xg', XGBClassifier(silent = False,
n_jobs=3) # Specify other parameters here
)
]
# Instantiate Pipeline object
pipe_xg = Pipeline(steps_xg)
# Grid Search code to discover best pipeline parameters
print("\n\n--Which parameters can be tuned?--\n\n")
pipe_xg.get_params()
--Which parameters can be tuned?--
{'memory': None,
'steps': [('sts', StandardScaler()),
('pca', PCA()),
('xg',
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, gamma=None,
gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100, n_jobs=3,
num_parallel_tree=None, predictor=None, random_state=None,
reg_alpha=None, reg_lambda=None, ...))],
'verbose': False,
'sts': StandardScaler(),
'pca': PCA(),
'xg': XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, gamma=None,
gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100, n_jobs=3,
num_parallel_tree=None, predictor=None, random_state=None,
reg_alpha=None, reg_lambda=None, ...),
'sts__copy': True,
'sts__with_mean': True,
'sts__with_std': True,
'pca__copy': True,
'pca__iterated_power': 'auto',
'pca__n_components': None,
'pca__n_oversamples': 10,
'pca__power_iteration_normalizer': 'auto',
'pca__random_state': None,
'pca__svd_solver': 'auto',
'pca__tol': 0.0,
'pca__whiten': False,
'xg__objective': 'binary:logistic',
'xg__use_label_encoder': False,
'xg__base_score': None,
'xg__booster': None,
'xg__callbacks': None,
'xg__colsample_bylevel': None,
'xg__colsample_bynode': None,
'xg__colsample_bytree': None,
'xg__early_stopping_rounds': None,
'xg__enable_categorical': False,
'xg__eval_metric': None,
'xg__gamma': None,
'xg__gpu_id': None,
'xg__grow_policy': None,
'xg__importance_type': None,
'xg__interaction_constraints': None,
'xg__learning_rate': None,
'xg__max_bin': None,
'xg__max_cat_to_onehot': None,
'xg__max_delta_step': None,
'xg__max_depth': None,
'xg__max_leaves': None,
'xg__min_child_weight': None,
'xg__missing': nan,
'xg__monotone_constraints': None,
'xg__n_estimators': 100,
'xg__n_jobs': 3,
'xg__num_parallel_tree': None,
'xg__predictor': None,
'xg__random_state': None,
'xg__reg_alpha': None,
'xg__reg_lambda': None,
'xg__sampling_method': None,
'xg__scale_pos_weight': None,
'xg__subsample': None,
'xg__tree_method': None,
'xg__validate_parameters': None,
'xg__verbosity': None,
'xg__silent': False}
# Deifne dictionary
parameters = {'xg__learning_rate': [0.03, 0.05], # learning rate decides what percentage
# of error is to be fitted by
# by next boosted tree.
# See this answer in stackoverflow:
# https://stats.stackexchange.com/questions/354484/why-does-xgboost-have-a-learning-rate
# Coefficients of boosted trees decide,
# in the overall model or scheme, how much importance
# each boosted tree shall have. Values of these
# Coefficients are calculated by modeling
# algorithm and unlike learning rate are
# not hyperparameters. These Coefficients
# get adjusted by l1 and l2 parameters
'xg__n_estimators': [50, 100], # Number of boosted trees to fit
# l1 and l2 specifications will change
# the values of coeff of boosted trees
# but not their numbers
'xg__max_depth': [10,6],
'pca__n_components' : [30,5],
'xg__booster': ['gbtree','gblinear']
}
# Define revised dictionary
parameters = {'xg__learning_rate': [0.03, 0.05], # learning rate decides what percentage
# of error is to be fitted by
# by next boosted tree.
# See this answer in stackoverflow:
# https://stats.stackexchange.com/questions/354484/why-does-xgboost-have-a-learning-rate
# Coefficients of boosted trees decide,
# in the overall model or scheme, how much importance
# each boosted tree shall have. Values of these
# Coefficients are calculated by modeling
# algorithm and unlike learning rate are
# not hyperparameters. These Coefficients
# get adjusted by l1 and l2 parameters
'xg__max_depth': [10,6],
'pca__n_components' : [30,5],
}
#### Instantiating GridSearchCV class
clf = GridSearchCV(pipe_xg, # pipeline object
parameters, # possible parameters
n_jobs = 2, # USe parallel cpu threads
cv =2 , # No of folds
verbose =1, # Higher the value, more the verbosity
scoring = ['accuracy', 'roc_auc'], # Metrics for performance
refit = 'roc_auc' # Refitting final model on what parameters?
# Those which maximise auc
)
from time import sleep
from time import * #meaning from time import EVERYTHING
import time
# Start fitting pipeline to data
print("\n\n--Takes time...---\n")
start = time.time()
clf.fit(X_train, y_train)
end = time.time()
print()
(end - start)/60
--Takes time...---
Fitting 2 folds for each of 8 candidates, totalling 16 fits
[22:33:41] WARNING: C:/Users/administrator/workspace/xgboost-win64_release_1.6.0/src/learner.cc:627:
Parameters: { "silent" } might not be used.
This could be a false alarm, with some parameters getting used by language bindings but
then being mistakenly passed down to XGBoost core, or some parameter actually being used
but getting flagged wrongly here. Please open an issue if you find any such cases.
GridSearchCV(cv=2,
estimator=Pipeline(steps=[('sts', StandardScaler()),
('pca', PCA()),
('xg',
XGBClassifier(base_score=None,
booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None,
gamma=None, gpu_id=None,
grow_policy=None,
importance_type=None,
intera...
max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
n_estimators=100,
n_jobs=3,
num_parallel_tree=None,
predictor=None,
random_state=None,
reg_alpha=None,
reg_lambda=None, ...))]),
n_jobs=2,
param_grid={'pca__n_components': [30, 5],
'xg__learning_rate': [0.03, 0.05],
'xg__max_depth': [10, 6]},
refit='roc_auc', scoring=['accuracy', 'roc_auc'], verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=2,
estimator=Pipeline(steps=[('sts', StandardScaler()),
('pca', PCA()),
('xg',
XGBClassifier(base_score=None,
booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None,
gamma=None, gpu_id=None,
grow_policy=None,
importance_type=None,
intera...
max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
n_estimators=100,
n_jobs=3,
num_parallel_tree=None,
predictor=None,
random_state=None,
reg_alpha=None,
reg_lambda=None, ...))]),
n_jobs=2,
param_grid={'pca__n_components': [30, 5],
'xg__learning_rate': [0.03, 0.05],
'xg__max_depth': [10, 6]},
refit='roc_auc', scoring=['accuracy', 'roc_auc'], verbose=1)Pipeline(steps=[('sts', StandardScaler()), ('pca', PCA()),
('xg',
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
gamma=None, gpu_id=None, grow_policy=None,
importance_type=None,
interaction_constraints=None, learning_rate=None,
max_bin=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=3,
num_parallel_tree=None, predictor=None,
random_state=None, reg_alpha=None,
reg_lambda=None, ...))])StandardScaler()
PCA()
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, gamma=None,
gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100, n_jobs=3,
num_parallel_tree=None, predictor=None, random_state=None,
reg_alpha=None, reg_lambda=None, ...)2.2357513189315794
# Make predictions using the best returned model
y_pred = clf.predict(X_test)
# 7.5 Accuracy score
print("\n\n--Accuracy Score--\n")
accuracy = accuracy_score(y_test, y_pred)
accuracy * 100
--Accuracy Score--
90.7507596531535
# F1 score
print("\n\n--F1 Score ")
f1_score(y_test,y_pred,average ='micro')
# 7.8 ROC curve and get AUC
print("\n\n--ROC curve--\n")
_=plot_roc_curve(clf, X_test, y_test)
--F1 Score
0.9075075965315349
--ROC curve--
# Get feature importances from GridSearchCV best fitted 'xg' model
clf.best_estimator_.named_steps["xg"].feature_importances_.shape
print("\n\n---Feature importances---\n")
clf.best_estimator_.named_steps["xg"].feature_importances_
(30,)
---Feature importances---
array([0.03265371, 0.03472751, 0.0308095 , 0.03223895, 0.03245703,
0.03357385, 0.03266273, 0.03215893, 0.03352813, 0.03415863,
0.03453534, 0.03328626, 0.03297805, 0.03232114, 0.03327659,
0.03340273, 0.03570846, 0.03412129, 0.03340523, 0.03336824,
0.03346148, 0.03315647, 0.03417563, 0.03429782, 0.03236499,
0.03339149, 0.03259033, 0.03415657, 0.03439463, 0.0326382 ],
dtype=float32)
colnames = X.columns.tolist()
# Create a dataframe of feature importances
# with names of columns and sorted by feature-imp
imp_values = clf.best_estimator_.named_steps["xg"].feature_importances_
df_imp = pd.DataFrame(
data = imp_values,
index = colnames,
columns = ["imp"]
).sort_values(by = 'imp')
df_imp
| imp | |
|---|---|
| Funded Amount Investor | 0.030810 |
| Employment Duration | 0.032159 |
| Term | 0.032239 |
| Inquires - six months | 0.032321 |
| Collection 12 months Medical | 0.032365 |
| Interest Rate | 0.032457 |
| Accounts Delinquent | 0.032590 |
| Total Revolving Credit Limit | 0.032638 |
| Loan Amount | 0.032654 |
| Sub Grade | 0.032663 |
| Delinquency - two years | 0.032978 |
| Total Received Late Fee | 0.033156 |
| Open Account | 0.033277 |
| Loan Title | 0.033286 |
| Initial List Status | 0.033368 |
| Last week Pay | 0.033391 |
| Public Record | 0.033403 |
| Total Accounts | 0.033405 |
| Total Received Interest | 0.033461 |
| Home Ownership | 0.033528 |
| Grade | 0.033574 |
| Revolving Utilities | 0.034121 |
| Total Collection Amount | 0.034157 |
| Verification Status | 0.034159 |
| Recoveries | 0.034176 |
| Collection Recovery Fee | 0.034298 |
| Total Current Balance | 0.034395 |
| Payment Plan | 0.034535 |
| Funded Amount | 0.034728 |
| Revolving Balance | 0.035708 |
# First five columns with least feature importance are:
list(df_imp.index.values[:5])
['Funded Amount Investor', 'Employment Duration', 'Term', 'Inquires - six months', 'Collection 12 months Medical']
# Let us drop these from X_train and X_test
Xtrain = X_train.drop(columns = list(df_imp.index.values[:5]))
Xtest = X_test.drop(columns = list(df_imp.index.values[:5]))
# Build model again with reduced dataset
clf_dr = GridSearchCV(pipe_xg, # pipeline object
parameters, # possible parameters
n_jobs = 2, # USe parallel cpu threads
cv =3 , # No of folds
verbose =2, # Higher the value, more the verbosity
scoring = ['accuracy', 'roc_auc'], # Metrics for performance
refit = 'roc_auc' # Those which maximise auc
)
start = time.time()
clf_dr.fit(Xtrain, y_train)
end = time.time()
(end - start)/60
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[22:34:56] WARNING: C:/Users/administrator/workspace/xgboost-win64_release_1.6.0/src/learner.cc:627:
Parameters: { "silent" } might not be used.
This could be a false alarm, with some parameters getting used by language bindings but
then being mistakenly passed down to XGBoost core, or some parameter actually being used
but getting flagged wrongly here. Please open an issue if you find any such cases.
GridSearchCV(cv=3,
estimator=Pipeline(steps=[('sts', StandardScaler()),
('pca', PCA()),
('xg',
XGBClassifier(base_score=None,
booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None,
gamma=None, gpu_id=None,
grow_policy=None,
importance_type=None,
intera...
max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
n_estimators=100,
n_jobs=3,
num_parallel_tree=None,
predictor=None,
random_state=None,
reg_alpha=None,
reg_lambda=None, ...))]),
n_jobs=2,
param_grid={'pca__n_components': [30, 5],
'xg__learning_rate': [0.03, 0.05],
'xg__max_depth': [10, 6]},
refit='roc_auc', scoring=['accuracy', 'roc_auc'], verbose=2)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=3,
estimator=Pipeline(steps=[('sts', StandardScaler()),
('pca', PCA()),
('xg',
XGBClassifier(base_score=None,
booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None,
gamma=None, gpu_id=None,
grow_policy=None,
importance_type=None,
intera...
max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
n_estimators=100,
n_jobs=3,
num_parallel_tree=None,
predictor=None,
random_state=None,
reg_alpha=None,
reg_lambda=None, ...))]),
n_jobs=2,
param_grid={'pca__n_components': [30, 5],
'xg__learning_rate': [0.03, 0.05],
'xg__max_depth': [10, 6]},
refit='roc_auc', scoring=['accuracy', 'roc_auc'], verbose=2)Pipeline(steps=[('sts', StandardScaler()), ('pca', PCA()),
('xg',
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
gamma=None, gpu_id=None, grow_policy=None,
importance_type=None,
interaction_constraints=None, learning_rate=None,
max_bin=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=3,
num_parallel_tree=None, predictor=None,
random_state=None, reg_alpha=None,
reg_lambda=None, ...))])StandardScaler()
PCA()
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, gamma=None,
gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100, n_jobs=3,
num_parallel_tree=None, predictor=None, random_state=None,
reg_alpha=None, reg_lambda=None, ...)0.8580585439999898
# Make predictions
y_pred_dr = clf_dr.predict(Xtest)
# F1 score
f1_score(y_test,y_pred_dr, average ='micro')
f1_score(y_test,y_pred, average ='micro')
0.9075075965315349
0.9075075965315349
import os
import gc
import random
# Used in Randomized parameter search
from scipy.stats import uniform
##################### EE. Randomized Search #################
# Tune parameters using randomized search
# Hyperparameters to tune and their ranges
parameters = {'xg__learning_rate': uniform(0, 1),
'xg__n_estimators': range(50,300),
'xg__max_depth': range(3,10),
'pca__n_components' : range(8,10)}
# Tune parameters using random search
# Create the object first
rs = RandomizedSearchCV(
pipe_xg,
param_distributions=parameters,
scoring= ['roc_auc', 'accuracy'],
n_iter=4, # Max combination of
# parameter to try. Default = 10
verbose = 1,
refit = 'roc_auc',
n_jobs = 2, # Use parallel cpu threads
cv = 2 # No of folds.
# So n_iter * cv combinations
)
start = time.time()
rs.fit(X_train, y_train)
end = time.time()
print()
(end - start)/60
Fitting 2 folds for each of 4 candidates, totalling 8 fits
[22:38:56] WARNING: C:/Users/administrator/workspace/xgboost-win64_release_1.6.0/src/learner.cc:627:
Parameters: { "silent" } might not be used.
This could be a false alarm, with some parameters getting used by language bindings but
then being mistakenly passed down to XGBoost core, or some parameter actually being used
but getting flagged wrongly here. Please open an issue if you find any such cases.
RandomizedSearchCV(cv=2,
estimator=Pipeline(steps=[('sts', StandardScaler()),
('pca', PCA()),
('xg',
XGBClassifier(base_score=None,
booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None,
gamma=None,
gpu_id=None,
grow_policy=None,
importance_type=None,...
predictor=None,
random_state=None,
reg_alpha=None,
reg_lambda=None, ...))]),
n_iter=4, n_jobs=2,
param_distributions={'pca__n_components': range(8, 10),
'xg__learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001F04419DA90>,
'xg__max_depth': range(3, 10),
'xg__n_estimators': range(50, 300)},
refit='roc_auc', scoring=['roc_auc', 'accuracy'], verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomizedSearchCV(cv=2,
estimator=Pipeline(steps=[('sts', StandardScaler()),
('pca', PCA()),
('xg',
XGBClassifier(base_score=None,
booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric=None,
gamma=None,
gpu_id=None,
grow_policy=None,
importance_type=None,...
predictor=None,
random_state=None,
reg_alpha=None,
reg_lambda=None, ...))]),
n_iter=4, n_jobs=2,
param_distributions={'pca__n_components': range(8, 10),
'xg__learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001F04419DA90>,
'xg__max_depth': range(3, 10),
'xg__n_estimators': range(50, 300)},
refit='roc_auc', scoring=['roc_auc', 'accuracy'], verbose=1)Pipeline(steps=[('sts', StandardScaler()), ('pca', PCA()),
('xg',
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
gamma=None, gpu_id=None, grow_policy=None,
importance_type=None,
interaction_constraints=None, learning_rate=None,
max_bin=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=3,
num_parallel_tree=None, predictor=None,
random_state=None, reg_alpha=None,
reg_lambda=None, ...))])StandardScaler()
PCA()
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, gamma=None,
gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100, n_jobs=3,
num_parallel_tree=None, predictor=None, random_state=None,
reg_alpha=None, reg_lambda=None, ...)0.701155138015747
# Evaluate
f"Best score: {rs.best_score_} " ;print()
f"Best parameter set: {rs.best_params_} " ; print()
# Make predictions from the best returned model
y_pred = rs.predict(X_test)
# Accuracy and f1_score
accuracy = accuracy_score(y_test, y_pred)
f"Accuracy: {accuracy * 100.0}" ; print()
f"f1 score: {f1_score(y_test,y_pred,average ='micro') }" ; print()
'Best score: 0.501027006832063 '
"Best parameter set: {'pca__n_components': 9, 'xg__learning_rate': 0.711485982806732, 'xg__max_depth': 8, 'xg__n_estimators': 121} "
'Accuracy: 89.42414585340546'
'f1 score: 0.8942414585340547'
# Cross-validation is a more reliable validation technique than just one train/test split.
# Here we'll resort to ShuffleSplit to create 5 70%/30% splits
from sklearn.model_selection import ShuffleSplit, KFold
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=17)
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
#We'll train 2 versions of the RandomForestClassifier model - first with default capacity (trees are not limited in depth),
#second - with min_samples_leaf=3, i.e. each leave is obliged to have at least 3 instances.
#%%time
model_rf1 = RandomForestClassifier(n_estimators=100, n_jobs=4,
max_depth=None, random_state=17)
# calcuate ROC-AUC for each split
cv_scores_rf1 = cross_val_score(model_rf1, X, y, cv=cv, scoring='roc_auc')
model_rf2 = RandomForestClassifier(n_estimators=100, n_jobs=4,
min_samples_leaf=3, random_state=17)
cv_scores_rf2 = cross_val_score(model_rf2, X, y, cv=cv,
scoring='roc_auc', n_jobs=-1)
#The result returned by cross_val_score is an array with metric values (ROC-AUC) for each split:
cv_scores_rf1
array([0.51963069, 0.50549526, 0.51712541, 0.52198077, 0.51389309])
cv_scores_rf2
array([0.52748311, 0.51399317, 0.52109925, 0.51319326, 0.51747239])
#compare average ROC-AUC among all splits for both models.
print('Model 1 mean score:', cv_scores_rf1.mean())
print('Model 2 mean score:', cv_scores_rf2.mean())
Model 1 mean score: 0.5156250436555302 Model 2 mean score: 0.5186482383475146
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
df = pd.read_csv("C:\\Users\\Lenovo\\Desktop\\Loan Details.csv")
# Columns in num_data that are either discrete (with few levels)
# or numeric
cols=['Loan Amount','Funded Amount','Total Accounts','Total Collection Amount','Total Current Balance','Total Revolving Credit Limit','Loan Status','Funded Amount Investor','Interest Rate','Recoveries','Collection Recovery Fee','Accounts Delinquent']
# Create an instance of StandardScaler object
ss = StandardScaler()
le= LabelEncoder()
# Create copy of DataFrame
df['Loan Status']= le.fit_transform(df['Loan Status']) # Transform boolean to integer
df
| ID | Loan Amount | Funded Amount | Funded Amount Investor | Term | Batch Enrolled | Interest Rate | Grade | Sub Grade | Employment Duration | ... | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Application Type | Last week Pay | Accounts Delinquent | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 65087372 | 10000 | 32236 | 12329.36286 | 59 | BAT2522922 | 11.135007 | B | C4 | MORTGAGE | ... | 2.498291 | 0.793724 | 0 | INDIVIDUAL | 49 | 0 | 31 | 311301 | 6619 | 0 |
| 1 | 1450153 | 3609 | 11940 | 12191.99692 | 59 | BAT1586599 | 12.237563 | C | D3 | RENT | ... | 2.377215 | 0.974821 | 0 | INDIVIDUAL | 109 | 0 | 53 | 182610 | 20885 | 0 |
| 2 | 1969101 | 28276 | 9311 | 21603.22455 | 59 | BAT2136391 | 12.545884 | F | D4 | MORTGAGE | ... | 4.316277 | 1.020075 | 0 | INDIVIDUAL | 66 | 0 | 34 | 89801 | 26155 | 0 |
| 3 | 6651430 | 11170 | 6954 | 17877.15585 | 59 | BAT2428731 | 16.731201 | C | C3 | MORTGAGE | ... | 0.107020 | 0.749971 | 0 | INDIVIDUAL | 39 | 0 | 40 | 9189 | 60214 | 0 |
| 4 | 14354669 | 16890 | 13226 | 13539.92667 | 59 | BAT5341619 | 15.008300 | C | D4 | MORTGAGE | ... | 1294.818751 | 0.368953 | 0 | INDIVIDUAL | 18 | 0 | 430 | 126029 | 22579 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 67458 | 16164945 | 13601 | 6848 | 13175.28583 | 59 | BAT3193689 | 9.408858 | C | A4 | MORTGAGE | ... | 564.614852 | 0.865230 | 0 | INDIVIDUAL | 69 | 0 | 48 | 181775 | 34301 | 1 |
| 67459 | 35182714 | 8323 | 11046 | 15637.46301 | 59 | BAT1780517 | 9.972104 | C | B3 | RENT | ... | 2.015494 | 1.403368 | 0 | INDIVIDUAL | 14 | 0 | 37 | 22692 | 8714 | 0 |
| 67460 | 16435904 | 15897 | 32921 | 12329.45775 | 59 | BAT1761981 | 19.650943 | A | F3 | MORTGAGE | ... | 5.673092 | 1.607093 | 0 | INDIVIDUAL | 137 | 0 | 17 | 176857 | 42330 | 0 |
| 67461 | 5300325 | 16567 | 4975 | 21353.68465 | 59 | BAT2333412 | 13.169095 | D | E3 | OWN | ... | 1.157454 | 0.207608 | 0 | INDIVIDUAL | 73 | 0 | 61 | 361339 | 39075 | 0 |
| 67462 | 65443173 | 15353 | 29875 | 14207.44860 | 59 | BAT1930365 | 16.034631 | B | D1 | MORTGAGE | ... | 1.856480 | 0.366386 | 0 | INDIVIDUAL | 54 | 0 | 47 | 196960 | 66060 | 0 |
67463 rows × 35 columns
# Use fit and transform method
nc = ss.fit_transform(data.loc[:,cols])
# Transform numpy array back to pandas dataframe
# as we will be using pandas plotting functions
nc = pd.DataFrame(nc, columns = cols)
nc.head(2)
| Loan Amount | Funded Amount | Total Accounts | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | Funded Amount Investor | Interest Rate | Recoveries | Collection Recovery Fee | Accounts Delinquent | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.818483 | 2.020064 | -1.397725 | -0.155120 | 1.091309 | -0.789041 | -0.319281 | -0.337854 | -0.191268 | -0.160195 | -0.094966 | 0.0 |
| 1 | -1.582243 | -0.469958 | -0.676500 | -0.125565 | 0.165689 | -0.106997 | -0.319281 | -0.358098 | 0.105229 | -0.160534 | -0.043073 | 0.0 |
# Add/overwrite few columns that are discrete
# These columns were not to be scaled
nc['Loan Status'] = data['Loan Status']
nc['Accounts Delinquent'] = data['Accounts Delinquent']
# Parallel coordinates chart
fig1 = plt.figure()
pd.plotting.parallel_coordinates(nc,
'Loan Status', # class_column
colormap='winter'
)
plt.xticks(rotation=90)
plt.title("Parallel chart with data")
<AxesSubplot:>
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), [Text(0, 0, 'Loan Amount'), Text(1, 0, 'Funded Amount'), Text(2, 0, 'Total Accounts'), Text(3, 0, 'Total Collection Amount'), Text(4, 0, 'Total Current Balance'), Text(5, 0, 'Total Revolving Credit Limit'), Text(6, 0, 'Funded Amount Investor'), Text(7, 0, 'Interest Rate'), Text(8, 0, 'Recoveries'), Text(9, 0, 'Collection Recovery Fee'), Text(10, 0, 'Accounts Delinquent')])
Text(0.5, 1.0, 'Parallel chart with data')
# Andrews charts
fig3 = plt.figure()
pd.plotting.andrews_curves(nc,
'Loan Status',
colormap = 'winter')
plt.title("Andrews plots with data")
<AxesSubplot:>
Text(0.5, 1.0, 'Andrews plots with data')
# Radviz plot
fig5 = plt.figure()
pd.plotting.radviz(nc,
class_column ='Loan Status',
colormap= plt.cm.winter,
alpha = 0.4
)
<AxesSubplot:>